{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "1e25a039",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/gpt2-117m.tar.gz\n",
    "# !tar -zxf gpt2-117m.tar.gz\n",
    "# !rm gpt2-117m.tar.gz\n",
    "# !wget https://huggingface.co/huseinzol05/bpe/resolve/main/gpt2-encoder.json\n",
    "# !wget https://huggingface.co/huseinzol05/bpe/resolve/main/gpt2-vocab.bpe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7b581a33",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ad592b1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "x = {\n",
    "    \"n_vocab\": 50257,\n",
    "    \"n_ctx\": 1024,\n",
    "    \"n_embd\": 768,\n",
    "    \"n_head\": 12,\n",
    "    \"n_layer\": 12\n",
    "}\n",
    "with open('config.json', 'w') as fopen:\n",
    "    json.dump(x, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "12cbae66",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "checkpoint\t frozen_model.pb.quantized\t model.ckpt.index\r\n",
      "frozen_model.pb  model.ckpt.data-00000-of-00001  model.ckpt.meta\r\n"
     ]
    }
   ],
   "source": [
    "!ls gpt2-117m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e1328b7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir gpt2-117m-pt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c81cd11f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converting TensorFlow checkpoint from /home/husein/malaya/gpt2-117m/model.ckpt\n",
      "Loading TF weight model/h0/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h0/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h0/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h0/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h0/ln_1/b with shape [768]\n",
      "Loading TF weight model/h0/ln_1/g with shape [768]\n",
      "Loading TF weight model/h0/ln_2/b with shape [768]\n",
      "Loading TF weight model/h0/ln_2/g with shape [768]\n",
      "Loading TF weight model/h0/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h0/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h0/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h0/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h1/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h1/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h1/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h1/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h1/ln_1/b with shape [768]\n",
      "Loading TF weight model/h1/ln_1/g with shape [768]\n",
      "Loading TF weight model/h1/ln_2/b with shape [768]\n",
      "Loading TF weight model/h1/ln_2/g with shape [768]\n",
      "Loading TF weight model/h1/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h1/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h1/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h1/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h10/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h10/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h10/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h10/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h10/ln_1/b with shape [768]\n",
      "Loading TF weight model/h10/ln_1/g with shape [768]\n",
      "Loading TF weight model/h10/ln_2/b with shape [768]\n",
      "Loading TF weight model/h10/ln_2/g with shape [768]\n",
      "Loading TF weight model/h10/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h10/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h10/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h10/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h11/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h11/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h11/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h11/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h11/ln_1/b with shape [768]\n",
      "Loading TF weight model/h11/ln_1/g with shape [768]\n",
      "Loading TF weight model/h11/ln_2/b with shape [768]\n",
      "Loading TF weight model/h11/ln_2/g with shape [768]\n",
      "Loading TF weight model/h11/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h11/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h11/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h11/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h2/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h2/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h2/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h2/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h2/ln_1/b with shape [768]\n",
      "Loading TF weight model/h2/ln_1/g with shape [768]\n",
      "Loading TF weight model/h2/ln_2/b with shape [768]\n",
      "Loading TF weight model/h2/ln_2/g with shape [768]\n",
      "Loading TF weight model/h2/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h2/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h2/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h2/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h3/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h3/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h3/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h3/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h3/ln_1/b with shape [768]\n",
      "Loading TF weight model/h3/ln_1/g with shape [768]\n",
      "Loading TF weight model/h3/ln_2/b with shape [768]\n",
      "Loading TF weight model/h3/ln_2/g with shape [768]\n",
      "Loading TF weight model/h3/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h3/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h3/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h3/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h4/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h4/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h4/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h4/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h4/ln_1/b with shape [768]\n",
      "Loading TF weight model/h4/ln_1/g with shape [768]\n",
      "Loading TF weight model/h4/ln_2/b with shape [768]\n",
      "Loading TF weight model/h4/ln_2/g with shape [768]\n",
      "Loading TF weight model/h4/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h4/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h4/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h4/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h5/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h5/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h5/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h5/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h5/ln_1/b with shape [768]\n",
      "Loading TF weight model/h5/ln_1/g with shape [768]\n",
      "Loading TF weight model/h5/ln_2/b with shape [768]\n",
      "Loading TF weight model/h5/ln_2/g with shape [768]\n",
      "Loading TF weight model/h5/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h5/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h5/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h5/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h6/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h6/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h6/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h6/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h6/ln_1/b with shape [768]\n",
      "Loading TF weight model/h6/ln_1/g with shape [768]\n",
      "Loading TF weight model/h6/ln_2/b with shape [768]\n",
      "Loading TF weight model/h6/ln_2/g with shape [768]\n",
      "Loading TF weight model/h6/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h6/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h6/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h6/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h7/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h7/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h7/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h7/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h7/ln_1/b with shape [768]\n",
      "Loading TF weight model/h7/ln_1/g with shape [768]\n",
      "Loading TF weight model/h7/ln_2/b with shape [768]\n",
      "Loading TF weight model/h7/ln_2/g with shape [768]\n",
      "Loading TF weight model/h7/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h7/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h7/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h7/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h8/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h8/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h8/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h8/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h8/ln_1/b with shape [768]\n",
      "Loading TF weight model/h8/ln_1/g with shape [768]\n",
      "Loading TF weight model/h8/ln_2/b with shape [768]\n",
      "Loading TF weight model/h8/ln_2/g with shape [768]\n",
      "Loading TF weight model/h8/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h8/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h8/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h8/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/h9/attn/c_attn/b with shape [2304]\n",
      "Loading TF weight model/h9/attn/c_attn/w with shape [1, 768, 2304]\n",
      "Loading TF weight model/h9/attn/c_proj/b with shape [768]\n",
      "Loading TF weight model/h9/attn/c_proj/w with shape [1, 768, 768]\n",
      "Loading TF weight model/h9/ln_1/b with shape [768]\n",
      "Loading TF weight model/h9/ln_1/g with shape [768]\n",
      "Loading TF weight model/h9/ln_2/b with shape [768]\n",
      "Loading TF weight model/h9/ln_2/g with shape [768]\n",
      "Loading TF weight model/h9/mlp/c_fc/b with shape [3072]\n",
      "Loading TF weight model/h9/mlp/c_fc/w with shape [1, 768, 3072]\n",
      "Loading TF weight model/h9/mlp/c_proj/b with shape [768]\n",
      "Loading TF weight model/h9/mlp/c_proj/w with shape [1, 3072, 768]\n",
      "Loading TF weight model/ln_f/b with shape [768]\n",
      "Loading TF weight model/ln_f/g with shape [768]\n",
      "Loading TF weight model/wpe with shape [1024, 768]\n",
      "Loading TF weight model/wte with shape [50257, 768]\n",
      "Initialize PyTorch weight ['h0', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h0', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h0', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h0', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h0', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h0', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h0', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h0', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h0', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h0', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h0', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h0', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h1', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h1', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h1', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h1', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h1', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h1', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h1', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h1', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h1', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h1', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h1', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h1', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h10', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h10', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h10', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h10', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h10', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h10', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h10', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h10', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h10', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h10', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h10', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h10', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h11', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h11', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h11', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h11', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h11', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h11', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h11', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h11', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h11', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h11', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h11', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h11', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h2', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h2', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h2', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h2', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h2', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h2', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h2', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h2', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h2', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h2', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h2', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h2', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h3', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h3', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h3', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h3', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h3', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h3', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h3', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h3', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h3', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h3', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h3', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h3', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h4', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h4', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h4', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h4', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h4', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h4', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h4', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h4', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h4', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h4', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h4', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h4', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h5', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h5', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h5', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h5', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h5', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h5', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h5', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h5', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h5', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h5', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h5', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h5', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h6', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h6', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h6', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h6', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h6', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h6', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h6', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h6', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h6', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h6', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h6', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h6', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h7', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h7', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h7', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h7', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h7', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h7', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h7', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h7', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h7', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h7', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h7', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h7', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h8', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h8', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h8', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h8', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h8', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h8', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h8', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h8', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h8', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h8', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h8', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h8', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h9', 'attn', 'c_attn', 'b']\n",
      "Initialize PyTorch weight ['h9', 'attn', 'c_attn', 'w']\n",
      "Initialize PyTorch weight ['h9', 'attn', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h9', 'attn', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['h9', 'ln_1', 'b']\n",
      "Initialize PyTorch weight ['h9', 'ln_1', 'g']\n",
      "Initialize PyTorch weight ['h9', 'ln_2', 'b']\n",
      "Initialize PyTorch weight ['h9', 'ln_2', 'g']\n",
      "Initialize PyTorch weight ['h9', 'mlp', 'c_fc', 'b']\n",
      "Initialize PyTorch weight ['h9', 'mlp', 'c_fc', 'w']\n",
      "Initialize PyTorch weight ['h9', 'mlp', 'c_proj', 'b']\n",
      "Initialize PyTorch weight ['h9', 'mlp', 'c_proj', 'w']\n",
      "Initialize PyTorch weight ['ln_f', 'b']\n",
      "Initialize PyTorch weight ['ln_f', 'g']\n",
      "Initialize PyTorch weight ['wpe']\n",
      "Initialize PyTorch weight ['wte']\n",
      "Save PyTorch model to gpt2-117m-pt/pytorch_model.bin\n",
      "Save configuration file to gpt2-117m-pt/config.json\n"
     ]
    }
   ],
   "source": [
    "!transformers-cli convert --model_type gpt2 \\\n",
    "  --tf_checkpoint gpt2-117m/model.ckpt \\\n",
    "  --pytorch_dump_output gpt2-117m-pt \\\n",
    "  --config config.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "cf59383f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "df1ec269",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = GPT2LMHeadModel.from_pretrained('./gpt2-117m-pt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "075d5c93",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('gpt2-117m-pt/tokenizer_config.json',\n",
       " 'gpt2-117m-pt/special_tokens_map.json',\n",
       " 'gpt2-117m-pt/vocab.json',\n",
       " 'gpt2-117m-pt/merges.txt',\n",
       " 'gpt2-117m-pt/added_tokens.json')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = GPT2Tokenizer('gpt2-encoder.json', 'gpt2-vocab.bpe')\n",
    "tokenizer.save_pretrained('gpt2-117m-pt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d342dd4b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[  77, 1228,  571,  374, 1031,  461,  512,  282,  993,  384,  273,  648]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.encode('najib razak adalah seorang', return_tensors='pt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "248d2a70",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    }
   ],
   "source": [
    "g = model.generate(tokenizer.encode('Najib Razak adalah seorang', return_tensors='pt'),\n",
    "                  do_sample=True, \n",
    "    max_length=100, \n",
    "    top_p=0.95, \n",
    "    top_k=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "2d046f76",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Najib Razak adalah seorang perisik dalam perangkap berterusan antara Israel dan Syria.\\nBerikutan kemerdekaan pada 24 September 2011, Mahathir mengisytiharkan kemerdekaan dari Syria kepada Presiden Bashar al-Assad.\\nPada 18 September 2011, Najib dibebaskan dengan parol dalam masa sejam ketika d'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(g[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "4afffc44",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.8/site-packages/huggingface_hub/utils/_deprecation.py:38: FutureWarning: Deprecated positional argument(s) used in 'create_repo': pass token='gpt2-117m-bahasa-cased' as keyword args. From version 0.12 passing these as positional arguments will result in an error,\n",
      "  warnings.warn(\n",
      "/home/husein/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:102: FutureWarning: `name` and `organization` input arguments are deprecated and will be removed in v0.10. Pass `repo_id` instead.\n",
      "  warnings.warn(\n",
      "/home/husein/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:681: FutureWarning: `create_repo` now takes `token` as an optional positional argument. Be sure to adapt your code!\n",
      "  warnings.warn(\n",
      "Cloning https://huggingface.co/mesolitica/gpt2-117m-bahasa-cased into local empty directory.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "08b4e5dfeed947b387fe80f42d31f90b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload file pytorch_model.bin:   0%|          | 32.0k/487M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "remote: Scanning LFS files for validity, may be slow...        \n",
      "remote: LFS file scan complete.        \n",
      "To https://huggingface.co/mesolitica/gpt2-117m-bahasa-cased\n",
      "   3405d93..2a3cf5d  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/mesolitica/gpt2-117m-bahasa-cased/commit/2a3cf5d0367ab46059d36ad5e807031fd1fd3dc9'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.push_to_hub('gpt2-117m-bahasa-cased', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "71b4380d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "To https://huggingface.co/mesolitica/gpt2-117m-bahasa-cased\n",
      "   2a3cf5d..77050de  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/mesolitica/gpt2-117m-bahasa-cased/commit/77050defadec5b18373e3e9d58c669583bc5fa66'"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.push_to_hub('gpt2-117m-bahasa-cased', organization='mesolitica')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
